{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import youtokentome as yttm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Corpus text files that are merged into the tokenizer training data.\n",
    "files = [\n",
    "    '../pure-text/filtered-dumping-wiki.txt',\n",
    "    '../pure-text/dumping-cleaned-news.txt',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every non-empty line from all corpus files into one list.\n",
    "results = []\n",
    "for file in files:\n",
    "    # Explicit UTF-8: without it open() falls back to the locale's default encoding.\n",
    "    with open(file, encoding='utf-8') as fopen:\n",
    "        # Stream line by line instead of holding the whole file plus its split copy;\n",
    "        # blank lines are skipped and the trailing newline is dropped.\n",
    "        results.extend(line.rstrip('\\n') for line in fopen if line != '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5521509"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of non-empty lines gathered from the corpus files.\n",
    "len(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the merged corpus, one entry per line, to the file used for BPE training.\n",
    "# Explicit UTF-8 so the output does not depend on the locale's default encoding.\n",
    "with open('concat.txt', 'w', encoding='utf-8') as fopen:\n",
    "    fopen.write('\\n'.join(results))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train a BPE model with a 32000-entry vocabulary on concat.txt; the model path is\n",
    "# truecase.yttm. Special-token ids: pad=0, eos=1, bos=2, unk=3.\n",
    "bpe = yttm.BPE.train(\n",
    "    data='concat.txt',\n",
    "    model='truecase.yttm',\n",
    "    vocab_size=32000,\n",
    "    pad_id=0,\n",
    "    eos_id=1,\n",
    "    bos_id=2,\n",
    "    unk_id=3,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mixed-case sample text for the encode/decode round-trip check below.\n",
    "string = 'Lawatan Putera Mahkota ARab SAudi BinCang pelabuRaN EKonomi KUALA LUMPUR LaWAtAn PUtera MahKotA ARAB Saudi MoHAmMEd bIN SalmAN KE MAlAySia Pada AhAD iNI TEraraH KePaDa perteMuan BaGi membinCaNgkaN Isu EKonomi Dan pelabuRaN AntaRa kedua-duA nEgaRa ! MenTeri LUar[ DatuK SaifuDDIn AbDUllah BeRKata tujuAN LAwatAn KerabaT DIraJA Arab SAudI itU AKan DipeRjElasKan OlEh KemeNteriaN HAl EhwAL EkOnomi )'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 393 µs, sys: 0 ns, total: 393 µs\n",
      "Wall time: 403 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['Lawatan Putera Mahkota ARab SAudi BinCang pelabuRaN EKonomi KUALA LUMPUR LaWAtAn PUtera MahKotA ARAB Saudi MoHAmMEd bIN SalmAN KE MAlAySia Pada AhAD iNI TEraraH KePaDa perteMuan BaGi membinCaNgkaN Isu EKonomi Dan pelabuRaN AntaRa kedua-duA nEgaRa ! MenTeri LUar[ DatuK SaifuDDIn AbDUllah BeRKata tujuAN LAwatAn KerabaT DIraJA Arab SAudI itU AKan DipeRjElasKan OlEh KemeNteriaN HAl EhwAL EkOnomi )']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Round trip: encode the sample to subword ids, then decode the ids back to text.\n",
    "bpe.decode(bpe.encode(string, output_type=yttm.OutputType.ID))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete the temporary merged corpus; the trained model lives in truecase.yttm.\n",
    "!rm concat.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import youtokentome as yttm\n",
    "\n",
    "# Load the BPE model from the truecase.yttm file.\n",
    "bpe = yttm.BPE(model='truecase.yttm')\n",
    "\n",
    "class Encoder:\n",
    "    \"\"\"Thin encode/decode wrapper around a BPE model that appends an EOS id.\n",
    "\n",
    "    Attributes:\n",
    "        bpe: the wrapped BPE model; it must provide vocab(), encode() and decode().\n",
    "        vocab_size: number of entries returned by bpe.vocab().\n",
    "        eos_id: id appended to every encoded sequence.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, bpe, eos_id=1):\n",
    "        \"\"\"Store the model and the EOS id (default 1, the value used so far).\"\"\"\n",
    "        self.bpe = bpe\n",
    "        self.vocab_size = len(self.bpe.vocab())\n",
    "        self.eos_id = eos_id\n",
    "\n",
    "    def encode(self, s):\n",
    "        \"\"\"Encode text to subword ids, each sequence terminated by the EOS id.\n",
    "\n",
    "        Args:\n",
    "            s: a single string or a list of strings.\n",
    "\n",
    "        Returns:\n",
    "            A flat list of ids for a string, or a list of id lists for a list.\n",
    "        \"\"\"\n",
    "        ids = self.bpe.encode(s, output_type=yttm.OutputType.ID)\n",
    "        if isinstance(s, str):\n",
    "            # A single string is encoded to one flat id list, so append EOS directly;\n",
    "            # iterating over it would try to add a list to each int.\n",
    "            return ids + [self.eos_id]\n",
    "        return [sequence + [self.eos_id] for sequence in ids]\n",
    "\n",
    "    def decode(self, ids, strip_extraneous=False):\n",
    "        \"\"\"Decode ids back to text and return the first decoded string.\n",
    "\n",
    "        Args:\n",
    "            ids: iterable of ids (or of id lists; only the first result is returned).\n",
    "            strip_extraneous: accepted but not used by this implementation.\n",
    "\n",
    "        Returns:\n",
    "            The decoded string, or '' when ids is empty.\n",
    "        \"\"\"\n",
    "        ids = list(ids)\n",
    "        if not ids:\n",
    "            # Nothing to decode; avoid indexing into an empty result.\n",
    "            return ''\n",
    "        return self.bpe.decode(ids)[0]\n",
    "    \n",
    "# Shared encoder instance used by the demo cells below.\n",
    "encoder = Encoder(bpe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'saya suka mAkan<EOS>'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Encoder.encode appends EOS per sequence of a batch, so pass a one-item list and\n",
    "# decode its first sequence; a bare string would make encode add a list to an int.\n",
    "encoder.decode(encoder.encode(['saya suka mAkan'])[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 87 µs, sys: 14 µs, total: 101 µs\n",
      "Wall time: 108 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "([1012, 5049, 623, 31, 641, 1], [1012, 5049, 623, 31, 641, 1])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Encode a batch holding the same sentence twice and unpack the two id lists.\n",
    "i, o = encoder.encode(['saya suka mAkan'] * 2)\n",
    "i, o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 95 µs, sys: 15 µs, total: 110 µs\n",
      "Wall time: 118 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[[1012, 5049, 623, 31, 641, 1]]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# NOTE(review): the same call runs twice; only the second result is displayed and the\n",
    "# timing covers both calls -- confirm the repetition is intentional.\n",
    "encoder.encode(['saya suka mAkan'])\n",
    "encoder.encode(['saya suka mAkan'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
